Predicting the worlds offline population

First Model Training and Evaluation

(Random Forest on Brazil)

In [1]:
# load libraries
import yaml
import lightgbm
import matplotlib.pyplot as plt
import pandas as pd
import geopandas
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
/opt/anaconda3/lib/python3.8/site-packages/geopandas/_compat.py:106: UserWarning: The Shapely GEOS version (3.8.0-CAPI-1.13.1 ) is incompatible with the GEOS version PyGEOS was compiled with (3.9.1-CAPI-1.14.2). Conversions between both will be slow.
  warnings.warn(
In [ ]:
one change

Load and prepare data

In [2]:
import os
import pandas as pd
os.chdir('/home/desktop3/itu')
path = os.getcwd()
print(path)
/home/desktop3/itu
In [3]:
# read in (yaml) configs
with open(path + '/conf/model_config.yaml', 'r') as conf:
    model_config = yaml.safe_load(conf)

# import data
dataset = model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset)
# subset for faster trial and error
#dataset = dataset.iloc[0:1000,:]

# define predictors and target
predictor   =  model_config['meta']['predictors']
target = ['A4A_right']
In [4]:
dataset.columns
Out[4]:
Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'Unnamed: 0.1.1.1.1', 'Unnamed: 0.1.1.1.1.1', 'School',
       'source_school_id', 'geometry', 'index_right', 'CLASSEA_right',
       'CLASSEB_right', 'CLASSEC_right', 'CLASSEDE_right', 'A5A1_right',
       'A5A2_right', 'A5A3_right', 'A5A4_right', 'A5A5_right', 'A5A6_right',
       'A5A7_right', 'A5A8_right', 'A5A9_right', 'A5A10_right', 'A7A1_right',
       'A4A_right', 'lon', 'lat', 'range', 'samples', 'avg_d_kbps',
       'avg_u_kbps', 'avg_d_kbps3', 'avg_u_kbps3', 'latitude', 'longitude',
       'mean_GHM', 'avg_rad_mean', '20_avg_rad_mean', '19_avg_rad_mean',
       'covid_avg_rad_mean', 'cf_cvg_mean', '20_cf_cvg_mean', '19_cf_cvg_mean',
       'covid_cf_cvg_mean', 'viirs_slope_yr', 'viirs_yr',
       'cf_cvg_viirs_slope_yr', 'cf_cvg_viirs_yr', 'viirs_slope_month',
       'viirs_slope_month_cf_cvg', 'ghsl_mean', 'estimate_dau', 'estimate_mau',
       'estimate_ready', 'enum_area', 'population', 'pop_norm', 'mean_NDVI',
       'NDVI_19', 'NDVI_14', 'NDVI_Slope', 'NDVI_Diff'],
      dtype='object')
In [ ]:
dataset
In [5]:
# prepare data
X = dataset[predictor]
y = dataset[target]
print('X Shape:', X.shape)
print('y Shape:', y.shape)
   
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = model_config['parameter']['test_size'], 
                                                    random_state = 42)

print('X_train, X_test, y_train, y_test shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("size of training dataset = ", len(X_train))
print("size of test dataset = ", len(X_test))
X Shape: (11596, 12)
y Shape: (11596, 1)
X_train, X_test, y_train, y_test shapes: (8117, 12) (3479, 12) (8117, 1) (3479, 1)
size of training dataset =  8117
size of test dataset =  3479
In [6]:
import seaborn as sns

sns.set(rc={'figure.figsize':(15,15)})

sns.heatmap(
    X.corr(), 
    cmap="coolwarm", 
    annot=True, 
    vmin=-1, 
    vmax=1,
)
Out[6]:
<AxesSubplot:>

Training

Prepare model tuning

In [7]:
# create inner and outer cross-validation sets
inner_cv = KFold(n_splits = model_config['parameter']['inner_cv'], shuffle=True)

# define parameter grid
parameters = {"n_estimators": [4, 5, 6, 15], "max_depth": [5, 10, 15, 20, 50]}

# define model class to use
model = RandomForestRegressor(random_state=42)

#Create custom scoring
from sklearn.metrics import make_scorer

def custom_eval_metric(y_true, y_pred):
    #errors_low_ytest = abs(y_pred[np.asarray(y_true).flatten()<0.3] - np.asarray(y_true[np.asarray(y_true).flatten()<0.3]).flatten())
    errors_low=abs(y_pred[y_pred<model_config['parameter']['threshold']] - np.asarray(y_true[y_pred<model_config['parameter']['threshold']]).flatten())
    return np.mean(errors_low)
    
custom_scorer = make_scorer(custom_eval_metric, greater_is_better = False)

# define grid search
search = RandomizedSearchCV(model, parameters, cv = inner_cv, random_state = 42,
verbose = 2, n_iter = model_config['parameter']['iterations'],
scoring = custom_scorer)

# search = RandomizedSearchCV(model, parameters, cv = 5, random_state = 42, 
#                             verbose = 2, n_iter = 50)
#others: 'max_error', 'neg_mean_absolute_percentage_error',

Model tuning

In [8]:
# find best parameters
search.fit(X_train, y_train.values.ravel())
Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
[CV] END ........................max_depth=5, n_estimators=4; total time=   0.1s
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.3s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.4s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.4s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.3s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.3s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.4s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.3s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.4s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.4s
[CV] END .......................max_depth=50, n_estimators=5; total time=   0.3s
[CV] END ......................max_depth=20, n_estimators=15; total time=   0.9s
[CV] END ......................max_depth=20, n_estimators=15; total time=   0.9s
[CV] END ......................max_depth=20, n_estimators=15; total time=   1.0s
[CV] END ......................max_depth=20, n_estimators=15; total time=   0.9s
[CV] END ......................max_depth=20, n_estimators=15; total time=   1.0s
[CV] END ......................max_depth=20, n_estimators=15; total time=   1.0s
[CV] END ......................max_depth=20, n_estimators=15; total time=   0.9s
[CV] END ......................max_depth=20, n_estimators=15; total time=   1.0s
[CV] END ......................max_depth=20, n_estimators=15; total time=   1.0s
[CV] END ......................max_depth=20, n_estimators=15; total time=   1.0s
/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:918: UserWarning: One or more of the test scores are non-finite: [        nan -0.06902642 -0.07076771]
  warnings.warn(
Out[8]:
RandomizedSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
                   estimator=RandomForestRegressor(random_state=42), n_iter=3,
                   param_distributions={'max_depth': [5, 10, 15, 20, 50],
                                        'n_estimators': [4, 5, 6, 15]},
                   random_state=42,
                   scoring=make_scorer(custom_eval_metric, greater_is_better=False),
                   verbose=2)

Tuning results

In [9]:
# all results
search.cv_results_
# best results
best_parameter = search.best_params_
print(best_parameter)
{'n_estimators': 5, 'max_depth': 50}
In [34]:
print(search.cv_results_)
{'mean_fit_time': array([0.08813989, 0.10895729, 0.13038912, 0.32037816, 0.15418773,
       0.19210937, 0.23063242, 0.56688457, 0.20202267, 0.25262635,
       0.30257347, 0.7467335 , 0.22977843, 0.28757741, 0.34532409,
       0.85172727, 0.24884441, 0.31219084, 0.37351534, 0.91822562]), 'std_fit_time': array([0.00121763, 0.00076212, 0.00074791, 0.00274973, 0.00234037,
       0.00193844, 0.00415273, 0.00494929, 0.00498716, 0.00464347,
       0.00549127, 0.01081706, 0.0089105 , 0.00768208, 0.00853397,
       0.018693  , 0.01598951, 0.01401317, 0.0157709 , 0.03339569]), 'mean_score_time': array([0.00226464, 0.00244961, 0.00243504, 0.00309165, 0.00249352,
       0.0026006 , 0.00275135, 0.00370114, 0.00278285, 0.00289738,
       0.00303259, 0.00447886, 0.00288696, 0.00310669, 0.003283  ,
       0.00512266, 0.00306313, 0.00328689, 0.00356166, 0.00578828]), 'std_score_time': array([2.94628261e-05, 1.90165959e-04, 3.45061868e-05, 4.95329216e-05,
       5.56981930e-05, 2.69151345e-05, 5.90351556e-05, 6.40793610e-05,
       2.06838346e-04, 8.87990790e-05, 4.20566859e-05, 6.96304756e-05,
       5.20332695e-05, 1.29397835e-04, 5.11034471e-05, 9.66675472e-05,
       3.65746922e-05, 3.71709475e-05, 3.91228906e-05, 7.25759470e-05]), 'param_n_estimators': masked_array(data=[4, 5, 6, 15, 4, 5, 6, 15, 4, 5, 6, 15, 4, 5, 6, 15, 4,
                   5, 6, 15],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20,
                   20, 50, 50, 50, 50],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 4, 'max_depth': 5}, {'n_estimators': 5, 'max_depth': 5}, {'n_estimators': 6, 'max_depth': 5}, {'n_estimators': 15, 'max_depth': 5}, {'n_estimators': 4, 'max_depth': 10}, {'n_estimators': 5, 'max_depth': 10}, {'n_estimators': 6, 'max_depth': 10}, {'n_estimators': 15, 'max_depth': 10}, {'n_estimators': 4, 'max_depth': 15}, {'n_estimators': 5, 'max_depth': 15}, {'n_estimators': 6, 'max_depth': 15}, {'n_estimators': 15, 'max_depth': 15}, {'n_estimators': 4, 'max_depth': 20}, {'n_estimators': 5, 'max_depth': 20}, {'n_estimators': 6, 'max_depth': 20}, {'n_estimators': 15, 'max_depth': 20}, {'n_estimators': 4, 'max_depth': 50}, {'n_estimators': 5, 'max_depth': 50}, {'n_estimators': 6, 'max_depth': 50}, {'n_estimators': 15, 'max_depth': 50}], 'split0_test_score': array([-0.19072819, -0.20517148, -0.18986702, -0.19244839, -0.14506556,
       -0.1463394 , -0.14456523, -0.1489188 , -0.12531056, -0.12680703,
       -0.10941157, -0.12475329, -0.11310236, -0.10747363, -0.10153349,
       -0.10115015, -0.10896397, -0.10276502, -0.10125235, -0.10990093]), 'split1_test_score': array([-0.23318137, -0.19211586, -0.20448478, -0.19299485, -0.14614864,
       -0.14175825, -0.14459321, -0.13064722, -0.13029603, -0.11563331,
       -0.11611587, -0.13481982, -0.1039997 , -0.10758187, -0.10372677,
       -0.10054803, -0.10404774, -0.09959058, -0.11798948, -0.10186819]), 'split2_test_score': array([-0.13770409, -0.14665737, -0.14920277, -0.14243024, -0.12497957,
       -0.12571002, -0.11549774, -0.11166972, -0.12160305, -0.11768313,
       -0.10987671, -0.10723281, -0.1138926 , -0.11700219, -0.1005808 ,
       -0.0974701 , -0.10789334, -0.09994603, -0.10176461, -0.09225987]), 'split3_test_score': array([-0.13230411, -0.12485584, -0.12330688, -0.14085185, -0.09303868,
       -0.09159938, -0.08883148, -0.10365933, -0.07386295, -0.07091054,
       -0.08522872, -0.09585708, -0.06010213, -0.06742632, -0.07341028,
       -0.07818947, -0.08039936, -0.07616072, -0.07869498, -0.06955068]), 'split4_test_score': array([-0.14360118, -0.14646636, -0.15216426, -0.13941436, -0.10706732,
       -0.10510289, -0.09814609, -0.1153842 , -0.09472896, -0.08662283,
       -0.08750869, -0.11198119, -0.09361446, -0.0725794 , -0.0733521 ,
       -0.08146729, -0.08922124, -0.07044292, -0.06985425, -0.07811939]), 'split5_test_score': array([-0.15520802, -0.1660368 , -0.1661358 , -0.1569005 , -0.10563431,
       -0.08561658, -0.09087072, -0.09473817, -0.0847562 , -0.08780471,
       -0.08629795, -0.09310759, -0.10514778, -0.09242068, -0.09156753,
       -0.0821747 , -0.1030752 , -0.09097687, -0.09328188, -0.08359477]), 'split6_test_score': array([-0.20152554, -0.20649447, -0.19026737, -0.20397956, -0.12777746,
       -0.14988862, -0.13521879, -0.14860495, -0.1065318 , -0.11888217,
       -0.10462828, -0.13428361, -0.10015147, -0.0992974 , -0.10279465,
       -0.12032956, -0.09064919, -0.10179145, -0.10342636, -0.10984158]), 'split7_test_score': array([-0.13158693, -0.14221263, -0.14247157, -0.15782974, -0.10846975,
       -0.10116474, -0.11800691, -0.11132585, -0.09801948, -0.09031838,
       -0.08675236, -0.09522213, -0.09327541, -0.07605147, -0.08106612,
       -0.09887943, -0.09862819, -0.09355497, -0.09523759, -0.08707251]), 'split8_test_score': array([-0.13256927, -0.13897914, -0.14724508, -0.1540143 , -0.10582553,
       -0.10148257, -0.11168001, -0.09251763, -0.0960093 , -0.09937651,
       -0.09814551, -0.08525328, -0.09802617, -0.09226167, -0.09260593,
       -0.0812287 , -0.08290606, -0.08516679, -0.08486447, -0.06976947]), 'split9_test_score': array([-0.15011117, -0.15715379, -0.15764257, -0.14103836, -0.16120339,
       -0.14765171, -0.13780396, -0.12170482, -0.13830735, -0.14065883,
       -0.17262687, -0.13158446, -0.12587578, -0.13763621, -0.13339448,
       -0.12490534, -0.11880257, -0.134939  , -0.13455437, -0.11228009]), 'mean_test_score': array([-0.16085199, -0.16261437, -0.16227881, -0.16219021, -0.12252102,
       -0.11963142, -0.11852141, -0.11791707, -0.10694257, -0.10546974,
       -0.10565925, -0.11140953, -0.10071879, -0.09697308, -0.09540322,
       -0.09663428, -0.09845868, -0.09553344, -0.09809203, -0.09142575]), 'std_test_score': array([0.03351184, 0.02750343, 0.02402628, 0.02352997, 0.02113264,
       0.02400995, 0.0203127 , 0.01885158, 0.02004106, 0.02062571,
       0.02482452, 0.01788055, 0.01659312, 0.02057077, 0.01682162,
       0.01555259, 0.01174805, 0.01678442, 0.01776855, 0.01558835]), 'rank_test_score': array([17, 20, 19, 18, 16, 15, 14, 13, 11,  9, 10, 12,  8,  5,  2,  4,  7,
        3,  6,  1], dtype=int32)}
In [18]:
print(search.best_score_)
-0.08472689206229156

Fit model with best parameters

In [17]:
# define model class to use
model = RandomForestRegressor(random_state = 42,
                              n_estimators = 20,
                              max_depth = 15,
                              )

# find best parameters
model.fit(X_train, y_train.values.ravel())
Out[17]:
RandomForestRegressor(max_depth=15, n_estimators=20, random_state=42)
In [20]:
y_test.values.ravel()
Out[20]:
array([0.60387938, 0.724975  , 0.69283101, ..., 1.        , 0.87262107,
       0.30652342])

Pickling the best model (also saved within mlflow as an artifact)

In [21]:
import pickle
In [41]:
#changing directory
os.chdir('/home/desktop3/itu/models/')
path = os.getcwd()
print(path)
/home/desktop3/itu/models
In [42]:
#Pickle the model
# save the model to disk
filename = 'RF_model_NDVI_.3Threshold.sav'
pickle.dump(model, open(filename, 'wb'))

Evaluation

In [18]:
# predict holdout
#pred = model.predict(X_test)
pred = model.predict(X_test)

# Absolute error
errors = abs(pred - y_test.iloc[:,0].to_numpy())
avg_error = np.mean(errors)

#Low tail error
errors_low = abs(pred[pred<model_config['parameter']['threshold']] - np.asarray(y_test[pred<model_config['parameter']['threshold']]).flatten())

#Low tail error
errors_low_ytest = abs(pred[np.asarray(y_test).flatten()<model_config['parameter']['threshold']]
                       - np.asarray(y_test[np.asarray(y_test).flatten()<model_config['parameter']['threshold']]).flatten())

#avg error
avg_error_low = np.mean(errors_low)


#avg error
avg_error_low_ytest = np.mean(errors_low_ytest)

#standard deviation
stan_dev_low= np.std(errors_low)



print('errors: ', errors)
print('avg error: ', avg_error)
# print('Just the lower errors: ', errors_low)
print('Mean lower error: ', avg_error_low)
print('Mean ytest lower error: ', avg_error_low_ytest)
# print('y test error: ', errors_low_ytest)
print('Standard Dev of Low Error: ', stan_dev_low)
errors:  [2.67990987e-05 3.68616931e-02 5.45930043e-02 ... 7.66906388e-02
 5.61017184e-02 1.49279420e-01]
avg error:  0.08756334974086423
Mean lower error:  0.08048511651679148
Mean ytest lower error:  0.2136455782851406
Standard Dev of Low Error:  0.1057313730593919

Examining the errors

In [2]:
#Checking to see the raw amount of school
print('The amount of schools predicted to be lower than 30%: ', len(pred[pred<.3]))
print('The amount of schools that are actually below 30%: ', len(y_test[y_test['A4A_right']<0.3]))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-4ef7e1c1e5a3> in <module>
      1 #Checking to see the raw amount of school
----> 2 print('The amount of schools predicted to be lower than 30%: ', len(pred[pred<.3]))
      3 print('The amount of schools that are actually below 30%: ', len(y_test[y_test['A4A_right']<0.3]))

NameError: name 'pred' is not defined
In [47]:
#Creating y_test dataframe to merge back
y_test['Predictions']= pred.tolist()
y_test['Errors']= abs(y_test['A4A_right']-y_test['Predictions'])
y_test
Out[47]:
A4A_right Predictions Errors
908 0.603879 0.517828 8.605169e-02
6372 0.724975 0.647398 7.757738e-02
3912 0.692831 0.724958 3.212736e-02
3090 0.423048 0.423048 5.551115e-17
468 0.868368 0.868368 1.110223e-16
... ... ... ...
7675 0.482814 0.744278 2.614649e-01
1346 0.751424 0.661508 8.991594e-02
10813 1.000000 1.000000 0.000000e+00
453 0.872621 0.847675 2.494630e-02
11116 0.306523 0.306523 0.000000e+00

3479 rows × 3 columns

In [48]:
#Only values where ground truth less than .3
onlygtvalues = y_test.loc[y_test['A4A_right']<.3]
In [49]:
# Merge y_test back into main df
df_merge = pd.merge(y_test, dataset, how= "inner", left_index=True, right_index=True)
In [50]:
High_Error_Schools = df_merge.loc[df_merge['Errors']>.3]
High_Error_Schools.shape
Out[50]:
(86, 66)
In [51]:
High_Pred_Schools = df_merge.loc[df_merge['Predictions']>.7]
High_Pred_Schools
Out[51]:
A4A_right_x Predictions Errors Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 Unnamed: 0.1.1.1.1 Unnamed: 0.1.1.1.1.1 School ... estimate_mau estimate_ready enum_area population pop_norm mean_NDVI NDVI_19 NDVI_14 NDVI_Slope NDVI_Diff
3912 0.692831 0.724958 3.212736e-02 3912 4000 4000 4000 4000 116117 POINT (-43.2623 -22.6758) ... 210000 True 1100.0 6193.8090 0.694889 5441.129327 5547.967843 5441.129327 17.806419 106.838516
468 0.868368 0.868368 1.110223e-16 468 489 489 489 489 9398 POINT (-60.7103 2.8155) ... 200000 True 81.0 5955.4740 0.693330 2663.136086 2494.298875 2663.136086 -28.139535 -168.837211
2210 0.927887 0.709794 2.180935e-01 2210 2290 2290 2290 2290 66051 POINT (-34.8874 -8.0045) ... 710000 True 527.0 12290.8070 0.734786 4246.875887 4420.134189 4246.875887 30.752384 184.514304
11194 0.763314 0.790506 2.719194e-02 11194 11330 11330 11330 11330 160998 POINT (-54.1272 -31.3429) ... 70000 True 1815.0 1757.7137 0.665861 6377.259839 5797.402887 6377.259839 -96.642825 -579.856952
7995 0.564533 0.834076 2.695425e-01 7995 8120 8120 8120 8120 107259 POINT (-43.3056 -15.8193) ... 15000 True 912.0 2973.6948 0.673818 4902.539607 5857.266927 4902.539607 159.121220 954.727321
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4687 0.516259 0.704238 1.879791e-01 4687 4782 4782 4782 4782 119706 POINT (-41.9769 -22.5516) ... 67000 True 1144.0 6553.9233 0.697246 5305.194469 5825.891082 5305.194469 88.239587 529.437524
12 0.885899 0.942950 5.705032e-02 12 12 12 12 12 52 POINT (-63.8808 -8.7913) ... 210000 True 4.0 7974.1675 0.706539 4499.841107 5049.830363 4499.841107 91.664876 549.989256
7675 0.482814 0.744278 2.614649e-01 7675 7794 7794 7794 7794 123031 POINT (-46.7729 -22.7107) ... 48000 True 1207.0 3238.0156 0.675548 6516.108673 7172.961003 6516.108673 109.475388 656.852331
10813 1.000000 1.000000 0.000000e+00 10813 10947 10947 10947 10947 14777 POINT (-52.2193 -3.2048) ... 48000 True 104.0 4612.9165 0.684544 3562.318893 3025.336109 3562.318893 -89.497131 -536.982783
453 0.872621 0.847675 2.494630e-02 453 474 474 474 474 8800 POINT (-60.7027 2.8283) ... 170000 True 77.0 3891.6443 0.679825 2529.567348 2555.578316 2529.567348 4.335161 26.010968

2184 rows × 66 columns

In [52]:
df_merge.shape
Out[52]:
(3479, 66)
In [53]:
#Creating a geodataframe
from shapely import wkt

df_merge['School'] = geopandas.GeoSeries.from_wkt(df_merge['School'])
gdf = geopandas.GeoDataFrame(df_merge, geometry='School')
In [54]:
#Subset by just low Ground truth values
Low_GT = df_merge.loc[df_merge['A4A_right_x']<.3]
Low_pred = df_merge.loc[df_merge['Predictions']<.3]
In [55]:
#Low_GT['School'] = geopandas.GeoSeries.from_wkt(Low_GT['School'])
gdf_GT = geopandas.GeoDataFrame(Low_GT, geometry='School')
In [56]:
#Low_pred['School'] = geopandas.GeoSeries.from_wkt(Low_pred['School'])
gdf_Low_pred = geopandas.GeoDataFrame(Low_pred, geometry='School')
In [57]:
Low_GT.loc[Low_GT['Predictions']>.3]
Out[57]:
A4A_right_x Predictions Errors Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 Unnamed: 0.1.1.1.1 Unnamed: 0.1.1.1.1.1 School ... estimate_mau estimate_ready enum_area population pop_norm mean_NDVI NDVI_19 NDVI_14 NDVI_Slope NDVI_Diff
9764 0.181060 0.714530 0.533470 9764 9896 9896 9896 9896 37253 POINT (-42.30970 -4.47430) ... 1000 True 274.0 502.014650 0.657644 7601.789250 7763.386968 7601.789250 26.932953 161.597718
1157 0.245145 0.362012 0.116867 1157 1203 1203 1203 1203 26287 POINT (-44.84310 -2.96100) ... 2000 True 223.0 193.319730 0.655624 6861.303001 6369.010173 6861.303001 -82.048805 -492.292828
2391 0.210017 0.375865 0.165848 2391 2471 2471 2471 2471 68821 POINT (-37.20220 -9.59030) ... 5900 True 605.0 43.519924 0.654644 5042.961132 7153.776280 5042.961132 351.802525 2110.815149
3737 0.127338 0.713276 0.585938 3737 3825 3825 3825 3825 119721 POINT (-43.76140 -22.86800) ... 85000 True 1111.0 3966.650600 0.680315 6001.651350 6426.896798 6001.651350 70.874241 425.245448
1883 0.241445 0.332859 0.091414 1883 1950 1950 1950 1950 59292 POINT (-35.18730 -6.80230) ... 10000 True 471.0 17.080410 0.654471 6470.051127 6885.910231 6470.051127 69.309851 415.859105
8694 0.277852 0.723914 0.446063 8694 8820 8820 8820 8820 43026 POINT (-42.78760 -5.09460) ... 360000 True 292.0 4176.862000 0.681691 3776.997918 3426.029720 3776.997918 -58.494700 -350.968198
9141 0.187138 0.584740 0.397602 9141 9271 9271 9271 9271 101327 POINT (-45.51060 -21.37230) ... 16000 True 993.0 3356.569000 0.676323 6543.426068 6968.656303 6543.426068 70.871706 425.230235
7054 0.215730 0.358710 0.142980 7054 7172 7172 7172 7172 158341 POINT (-51.15230 -30.04390) ... 520000 True 1876.0 8572.529000 0.710455 4912.328888 4964.727510 4912.328888 8.733104 52.398622
7053 0.215730 0.593490 0.377759 7053 7171 7171 7171 7171 158319 POINT (-51.15840 -30.04680) ... 520000 True 1876.0 7497.264000 0.703419 4921.763955 4976.649678 4921.763955 9.147621 54.885724
2631 0.155975 0.312952 0.156977 2631 2711 2711 2711 2711 75567 POINT (-42.16840 -11.24310) ... 1100 True 799.0 81.276955 0.654891 4637.520598 6143.049813 4637.520598 250.921536 1505.529215
2392 0.210017 0.443101 0.233084 2392 2472 2472 2472 2472 68866 POINT (-37.24800 -9.53010) ... 6800 True 605.0 169.732820 0.655470 4416.344795 6965.909225 4416.344795 424.927405 2549.564430
7090 0.227100 0.448598 0.221498 7090 7208 7208 7208 7208 158362 POINT (-51.22190 -30.06660) ... 520000 True 1880.0 6483.629000 0.696786 3781.796884 3991.650953 3781.796884 32.355581 194.133487
10373 0.198928 0.347466 0.148539 10373 10506 10506 10506 10506 30212 POINT (-43.36390 -3.74680) ... 13000 True 208.0 4342.514000 0.682775 7285.789439 6987.690287 7285.789439 -49.683192 -298.099152
1160 0.245145 0.484989 0.239844 1160 1206 1206 1206 1206 26327 POINT (-44.87030 -2.94560) ... 1400 True 223.0 26.316500 0.654531 6511.842146 6839.392583 6511.842146 54.591739 327.550437
3650 0.229238 0.720920 0.491682 3650 3738 3738 3738 3738 119618 POINT (-41.30600 -21.69690) ... 64000 True 1079.0 694.848800 0.658906 5651.866435 5525.704993 5651.866435 -21.026907 -126.161442
5106 0.283260 0.522173 0.238913 5106 5209 5209 5209 5209 122710 POINT (-46.43910 -23.49860) ... 1000000 True 1557.0 9452.590000 0.716214 2793.908671 3071.567293 2793.908671 46.276437 277.658621
10410 0.198928 0.303440 0.104513 10410 10543 10543 10543 10543 30262 POINT (-43.36780 -3.74520) ... 13000 True 208.0 3898.486800 0.679869 7280.077391 7004.968598 7280.077391 -45.851466 -275.108793
3431 0.182751 0.412072 0.229321 3431 3519 3519 3519 3519 109853 POINT (-41.12240 -18.85850) ... 3900 True 1031.0 1075.434300 0.661396 6589.065659 6955.693204 6589.065659 61.104591 366.627545
11452 0.226581 0.498686 0.272104 11452 11588 11588 11588 11588 128538 POINT (-46.78620 -23.63220) ... 790000 True 1591.0 17628.180000 0.769712 3566.714823 3926.224662 3566.714823 59.918306 359.509838
7209 0.152324 0.548745 0.396421 7209 7327 7327 7327 7327 163087 POINT (-54.62290 -20.55200) ... 170000 True 1947.0 2695.869900 0.672000 5345.378569 5228.894980 5345.378569 -19.413931 -116.483589
3646 0.283510 0.753653 0.470144 3646 3734 3734 3734 3734 113666 POINT (-41.29760 -21.82510) ... 53000 True 1083.0 1119.155800 0.661682 6283.144605 5815.520418 6283.144605 -77.937364 -467.624187
1118 0.288257 0.313322 0.025065 1118 1157 1157 1157 1157 25207 POINT (-42.90480 -2.83090) ... 2400 True 204.0 63.825000 0.654777 6575.385691 7379.469546 6575.385691 134.013976 804.083855
7308 0.169429 0.373607 0.204179 7308 7426 7426 7426 7426 163862 POINT (-52.32570 -21.46740) ... 1000 True 1937.0 246.005870 0.655969 6861.719516 7772.944882 6861.719516 151.870894 911.225366
1084 0.242649 0.547625 0.304975 1084 1123 1123 1123 1123 33358 POINT (-44.18570 -2.57070) ... 160000 True 226.0 8893.483000 0.712555 4282.390760 4969.809400 4282.390760 114.569773 687.418640
1123 0.288257 0.307187 0.018930 1123 1162 1162 1162 1162 25262 POINT (-42.87760 -2.82160) ... 2700 True 204.0 52.094128 0.654700 6234.064804 7192.790209 6234.064804 159.787568 958.725405
7954 0.226581 0.393617 0.167036 7954 8079 8079 8079 8079 139975 POINT (-46.78380 -23.63250) ... 810000 True 1591.0 17628.180000 0.769712 3492.325760 3847.070618 3492.325760 59.124143 354.744858
9267 0.226581 0.330540 0.103958 9267 9397 9397 9397 9397 126191 POINT (-46.77940 -23.63680) ... 880000 True 1591.0 15496.825000 0.755765 3344.419880 3681.537719 3344.419880 56.186307 337.117839
5120 0.283260 0.411421 0.128161 5120 5223 5223 5223 5223 141021 POINT (-46.45550 -23.49130) ... 840000 True 1557.0 10268.229500 0.721551 2799.367999 3122.826562 2799.367999 53.909760 323.458562
10370 0.198928 0.303440 0.104513 10370 10503 10503 10503 10503 30209 POINT (-43.36460 -3.74380) ... 13000 True 208.0 4281.868700 0.682378 7258.340793 6988.320930 7258.340793 -45.003311 -270.019863
1245 0.296471 0.474380 0.177909 1245 1312 1312 1312 1312 38165 POINT (-42.64030 -5.67830) ... 1000 True 284.0 4.117733 0.654386 7457.128165 7007.490960 7457.128165 -74.939534 -449.637204
10657 0.265696 0.329627 0.063931 10657 10790 10790 10790 10790 145522 POINT (-48.58350 -25.88450) ... 14000 True 1660.0 2103.714600 0.668125 5812.912381 5915.260545 5812.912381 -18.693865 -112.163191
7058 0.215730 0.370289 0.154559 7058 7176 7176 7176 7176 158475 POINT (-51.14610 -30.04890) ... 490000 True 1876.0 6689.484000 0.698133 5128.701281 5188.596228 5128.701281 9.982491 59.894948
1885 0.241445 0.409174 0.167730 1885 1952 1952 1952 1952 60000 POINT (-35.14190 -6.76910) ... 10000 True 471.0 25.928537 0.654529 6882.404577 7114.598117 6882.404577 38.698923 232.193540
10758 0.256105 0.721444 0.465338 10758 10892 10892 10892 10892 162142 POINT (-51.79740 -27.95210) ... 5600 True 1896.0 843.176000 0.659876 5447.153044 5815.623791 5447.153044 61.411791 368.470747
1886 0.241445 0.324615 0.083171 1886 1953 1953 1953 1953 60004 POINT (-35.17060 -6.74740) ... 6700 True 471.0 15.628907 0.654461 7316.738341 7474.501119 7316.738341 26.293796 157.762778
7089 0.227100 0.573706 0.346606 7089 7207 7207 7207 7207 158353 POINT (-51.21330 -30.07460) ... 520000 True 1880.0 6916.410000 0.699618 4340.380895 4561.165997 4340.380895 32.776296 196.657775
3369 0.124122 0.383582 0.259460 3369 3457 3457 3457 3457 98513 POINT (-46.58050 -15.49980) ... 1000 True 858.0 67.325380 0.654800 5963.546403 6178.500306 5963.546403 35.825650 214.953902
3649 0.229238 0.720920 0.491682 3649 3737 3737 3737 3737 113649 POINT (-41.30520 -21.69510) ... 63000 True 1079.0 694.848800 0.658906 5684.543009 5551.445285 5684.543009 -22.182954 -133.097724
3098 0.217249 0.316161 0.098912 3098 3184 3184 3184 3184 92007 POINT (-43.91140 -19.93810) ... 810000 True 809.0 11143.399000 0.727278 4762.058840 5035.640629 4762.058840 45.596965 273.581789
3647 0.283510 0.753653 0.470144 3647 3735 3735 3735 3735 118556 POINT (-41.29570 -21.82350) ... 55000 True 1083.0 1297.589500 0.662850 6244.040624 5779.108511 6244.040624 -77.488686 -464.932113
1880 0.241445 0.409174 0.167730 1880 1947 1947 1947 1947 59287 POINT (-35.14140 -6.76870) ... 9400 True 471.0 25.928537 0.654529 6888.166668 7124.265047 6888.166668 39.349730 236.098379
7092 0.227100 0.346365 0.119265 7092 7210 7210 7210 7210 158454 POINT (-51.22130 -30.07190) ... 510000 True 1880.0 7660.049300 0.704484 4008.226826 4222.743374 4008.226826 31.843053 191.058316
7828 0.265696 0.500870 0.235174 7828 7949 7949 7949 7949 147920 POINT (-48.57870 -25.88490) ... 15000 True 1660.0 1609.585400 0.664892 5715.109038 5852.137706 5715.109038 -19.306294 -115.837762
10431 0.271335 0.708586 0.437252 10431 10564 10564 10564 10564 128601 POINT (-46.76190 -23.64720) ... 960000 True 1493.0 13184.170000 0.740632 2920.966457 3290.466124 2920.966457 61.583278 369.499667
8703 0.217249 0.377574 0.160326 8703 8829 8829 8829 8829 92208 POINT (-43.91510 -19.94390) ... 780000 True 809.0 9282.687000 0.715102 4849.378966 5123.072627 4849.378966 45.615610 273.693662
1359 0.277852 0.356958 0.079106 1359 1426 1426 1426 1426 42895 POINT (-42.77070 -5.08380) ... 350000 True 292.0 3190.135000 0.675234 4374.512562 4083.527823 4374.512562 -48.497456 -290.984739
3411 0.083206 0.409149 0.325943 3411 3499 3499 3499 3499 102372 POINT (-43.39060 -20.77270) ... 1100 True 992.0 19.784330 0.654489 7872.066481 8192.691743 7872.066481 53.437544 320.625262
5114 0.283260 0.733599 0.450339 5114 5217 5217 5217 5217 132219 POINT (-46.43380 -23.49720) ... 1000000 True 1557.0 10078.351000 0.720308 2834.694572 3094.012580 2834.694572 43.219668 259.318008
9288 0.256105 0.721444 0.465338 9288 9418 9418 9418 9418 158982 POINT (-51.79770 -27.95270) ... 5500 True 1896.0 815.357900 0.659694 5440.639279 5809.630459 5440.639279 61.498530 368.991180

49 rows × 66 columns

Plotting the errors

In [58]:
fig,ax =plt.subplots(1, figsize=(14,6))

# add a title and annotation
ax.set_title('Predictions for schools in test set', fontdict={'fontsize': '13', 'fontweight' : '3'})

gdf.plot(column="Predictions", cmap = 'viridis' ,legend=True, ax=ax)
# ctx.add_basemap(ax)

plt.show()
In [59]:
fig,ax =plt.subplots(1, figsize=(14,6))

# add a title and annotation
ax.set_title('Errors in Schools where Prediction is less than 30% connected', fontdict={'fontsize': '13', 'fontweight' : '3'})

#Do errors where ground truth is below .3 and also pred below .3
gdf_Low_pred.plot(column="Errors", cmap = 'viridis' ,legend=True, ax=ax)

plt.show()
In [33]:
vmin, vmax= 0, .7

fig,ax =plt.subplots(1, figsize=(14,6))

# add a title and annotation
ax.set_title('Errors in Schools where Ground Truth is less than 30% connected', fontdict={'fontsize': '13', 'fontweight' : '3'})

#Do errors where ground truth is below .3 and also pred below .3
gdf_GT.plot(column="Errors", cmap = 'viridis' ,legend=True, ax=ax)

plt.show()
In [34]:
vmin, vmax= 0, .7

fig,ax =plt.subplots(1, figsize=(14,6))

# add a title and annotation
ax.set_title('Errors in Schools in Brazil test set', fontdict={'fontsize': '13', 'fontweight' : '3'})

gdf.plot(column="Errors", cmap = 'viridis' ,legend=True, ax=ax)

plt.show()
In [35]:
y = y_test.iloc[:,0].to_numpy()
y_pred = pred

fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'}, 
                 title = 'Comparison between predictions and reality',
                 template = 'plotly_dark')
fig.update_traces(marker=dict(size=3, 
                              color=((abs(y-y_pred) < 0.15).astype('int')),
                              colorscale=[[0, '#FAED27'],[1, '#98FB98']])
                             )
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y.min(), y0=y.min(),
    x1=y.max(), y1=y.max()
)
fig.show()
In [36]:
res_df = pd.DataFrame()
res_df['prediction'] = y_pred
res_df['ground truth'] = y
#res_df['train'] = y_train
res_df['residual'] = (pred - y_test.iloc[:,0].to_numpy())
fig = px.scatter(
    res_df, x='ground truth', y='residual',
    #marginal_y='violin',
    trendline='ols', template = 'plotly_dark',
    title = 'Comparison between residuals and reality'
)
fig.update_traces(marker=dict(size=3, 
                              color=((abs(res_df.residual) < 0.15).astype('int')),
                              colorscale=[[0, '#FAED27'],[1, '#98FB98']])
                             )
fig.show()
In [37]:
fig = px.scatter(
    res_df, x='prediction', y='residual',
    #marginal_y='violin',
    trendline='ols', template = 'plotly_dark',
    title = 'Comparison between residuals and predictions'
)
fig.update_traces(marker=dict(size=3, 
                              color=((abs(res_df.residual) < 0.15).astype('int')),
                              colorscale=[[0, '#FAED27'],[1, '#98FB98']])
                             )
fig.show()
In [38]:
online_pop = [pred, y_test.iloc[:,0].to_numpy()]
labels = ['predictions', 'reality']
         
fig = ff.create_distplot(online_pop, labels, show_hist = False)
fig.layout.update({'title':'Comparison of distributions of reality and predictions',
                   'title_font_color':'white',
                   'legend_bgcolor':'#545454',
                   'font_color':'white',
                   'plot_bgcolor':'#545454',
                   'paper_bgcolor':'#2a2a2a',
                   'yaxis':{'gridcolor':'#2a2a2a', 'zerolinecolor':'#2a2a2a'},
                   'xaxis':{'gridcolor':'#2a2a2a'}
                   })
fig.show()

Model Interpretation

Examining feature importances

In [20]:
# Make the dataframe
importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": model.feature_importances_}
).sort_values("Importance")
In [21]:
importance
Out[21]:
Feature Importance
0 avg_d_kbps 0.034306
1 avg_u_kbps 0.035971
10 pop_norm 0.059417
2 mean_GHM 0.065541
5 viirs_slope_yr 0.072550
11 mean_NDVI 0.083756
7 viirs_slope_month 0.084703
4 cf_cvg_mean 0.108273
6 cf_cvg_viirs_slope_yr 0.111032
3 avg_rad_mean 0.112357
9 estimate_mau 0.114502
8 viirs_slope_month_cf_cvg 0.117591
In [22]:
import matplotlib.pyplot as plt
fig,ax =plt.subplots(1, figsize=(14,6))

# add a title and annotation
ax.set_title('Feature Importances', fontdict={'fontsize': '13', 'fontweight' : '3'})

(pd.Series(model.feature_importances_, index=X.columns)
   .nsmallest(12).plot(kind='barh'))
Out[22]:
<AxesSubplot:title={'center':'Feature Importances'}>